{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "688d43e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "import zipfile\n",
    "import os\n",
    "\n",
    "def chunks(l, n):\n",
    "    for i in range(0, len(l), n):\n",
    "        yield (l[i: i + n], i // n)\n",
    "\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    df_split = chunks(strings, len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    pooled = pool.map(function, df_split)\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain(*pooled))\n",
    "\n",
    "\n",
    "\n",
    "def loop(files):\n",
    "    files, _ = files\n",
    "    for zip_file_path in tqdm(files):\n",
    "        destination_folder = './'\n",
    "        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
    "            zip_ref.extractall(destination_folder)\n",
    "        # os.remove(zip_file_path)\n",
    "\n",
    "# multiprocessing(files, loop, cores = min(len(files), 20), returned = False)\n",
    "# files = glob('TAU-*.zip')\n",
    "# multiprocessing(files, loop, cores = min(len(files), 20), returned = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a411fcbb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "audio  evaluation_setup  LICENSE  meta.csv  README.html  README.md\r\n"
     ]
    }
   ],
   "source": [
    "!ls TAU-urban-acoustic-scenes-2022-mobile-development"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "435d9050",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "230350"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files = glob('TAU-urban-acoustic-scenes-2022-mobile-development/audio/*')\n",
    "len(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "38ca1856",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'TAU-urban-acoustic-scenes-2022-mobile-development/audio/park-paris-98-2761-4-a.wav'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "23da8b02",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir tau-2022-audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "c71f48f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'filename': 'audio/airport-lisbon-1000-40000-0-a.wav',\n",
       " 'scene_label': 'airport',\n",
       " 'identifier': 'lisbon-1000',\n",
       " 'source_label': 'a'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import soundfile as sf\n",
    "import librosa\n",
    "\n",
    "df = pd.read_csv('TAU-urban-acoustic-scenes-2022-mobile-development/meta.csv', sep = '\\t').to_dict(orient = 'records')\n",
    "df[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5fe2450f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']\""
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels = [r['scene_label'] for r in df]\n",
    "labels = str(list(set(labels)))\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "2cb997d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import json\n",
    "questions = [\n",
    "    'given the labels\\n{labels}\\n\\nclassify the audio',\n",
    "    'what is the label for audio\\n\\nthe labels: {labels}'\n",
    "]\n",
    "\n",
    "def loop(rows):\n",
    "    rows, _ = rows\n",
    "    data = []\n",
    "    for r in tqdm(rows):\n",
    "        f = os.path.join('TAU-urban-acoustic-scenes-2022-mobile-development', r['filename'])\n",
    "        audio_filename = os.path.join('tau-2022-audio', f.replace('/', '_')).replace('.wav', '.mp3')\n",
    "        \n",
    "        if not os.path.exists(audio_filename):\n",
    "            y, sr = librosa.load(f, sr = 16000)\n",
    "            sf.write(audio_filename, y, sr)\n",
    "        \n",
    "        data.append({\n",
    "            'question': random.choice(questions).format(labels=labels),\n",
    "            'answer': r['scene_label'],\n",
    "            'audio_filename': audio_filename,\n",
    "            'metadata': json.dumps(r),\n",
    "        })\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "e7bb6313",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 17848.10it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = loop((df[:10], 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "73991a73",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': \"what is the label for audio\\n\\nthe labels: ['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']\",\n",
       " 'answer': 'airport',\n",
       " 'audio_filename': 'tau-2022-audio/TAU-urban-acoustic-scenes-2022-mobile-development_audio_airport-lisbon-1000-40000-0-a.mp3',\n",
       " 'metadata': '{\"filename\": \"audio/airport-lisbon-1000-40000-0-a.wav\", \"scene_label\": \"airport\", \"identifier\": \"lisbon-1000\", \"source_label\": \"a\"}'}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "92214a10",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.46it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 115.26it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.37it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.24it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 118.85it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.50it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.11it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.89it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 118.87it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.26it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 118.94it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.05it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.70it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.06it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.68it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.11it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:38<00:00, 117.13it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.80it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.57it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.82it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.74it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = multiprocessing(df, loop, cores = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "3d147248",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "230350"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "cb06522b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': \"what is the label for audio\\n\\nthe labels: ['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']\",\n",
       " 'answer': 'airport',\n",
       " 'audio_filename': 'tau-2022-audio/TAU-urban-acoustic-scenes-2022-mobile-development_audio_airport-lisbon-1000-40000-0-a.mp3',\n",
       " 'metadata': '{\"filename\": \"audio/airport-lisbon-1000-40000-0-a.wav\", \"scene_label\": \"airport\", \"identifier\": \"lisbon-1000\", \"source_label\": \"a\"}'}"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "cccb5713",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "dataset = Dataset.from_list(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "a002aad9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "44feb335c80840ea9a90823f7186b4ba",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df47d2f7f9e64a4f9b257e23247808c9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/231 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b2f43cffc28c480b9b385a594b9daa8e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/5.59M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c5a254c9cb7b45cbad1c664c24665431",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/3.45k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/80352deea4eed4770fefb65b8b324309bb6806f5', commit_message='Upload dataset', commit_description='', oid='80352deea4eed4770fefb65b8b324309bb6806f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'tau2022')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f336e0a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "230350"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "audio_files = glob('tau-2022-audio/*.mp3')\n",
    "len(audio_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "f8390af2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "\n",
    "with zipfile.ZipFile('tau-2022-audio.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    "    for f in audio_files:\n",
    "        zipf.write(f, arcname=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "9f64d4f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...: 100%|████████████████████████| 1.23G/1.23G [00:25<00:00, 48.9MB/s]\n",
      "https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/tau-2022-audio.zip\n"
     ]
    }
   ],
   "source": [
    "!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions tau-2022-audio.zip \\\n",
    "--repo-type=dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "31aa4f73",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm TAU-urban-acoustic-scenes-2022-mobile-development*.zip"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
